
# R script for repeating main ranking exercise by faculty status (rank)
# This file replicates appendix Tables A6-A11

rm(list = ls())

if (!require('tidyverse')) install.packages('tidyverse')
if (!require('haven')) install.packages('haven')
if (!require('knitr')) install.packages('knitr')
if (!require('kableExtra')) install.packages('kableExtra')

export_tables <- TRUE
if (export_tables & !dir.exists('Tables/')) dir.create('Tables/')

# Load data ---------------------------------------------------------------

# Make sure the raw dataset is placed in the same (sub)dir as this script

dat_raw <- readRDS("Dataset_raw.RDS")

# Add measures ------------------------------------------------------------

# Make sure the university spreadsheet is placed in the same (sub)dir as this script

## add years of publication ----
dat_raw$year = ifelse(dat_raw$year_forthcoming==1, 2023, dat_raw$year)

## add years since PhD ----
dat_raw$phd_year_passed = dat_raw$year - dat_raw$phd_year + 1

## add Ttop-5 polisci publications ----
dat_raw$polisci_top5 = 0
dat_raw$polisci_top5[dat_raw$journal=="AMERICAN POLITICAL SCIENCE REVIEW"] <- 2
dat_raw$polisci_top5[dat_raw$journal %in% c("AMERICAN JOURNAL OF POLITICAL SCIENCE", "JOURNAL OF POLITICS", "INTERNATIONAL ORGANIZATION","WORLD POLITICS")] <- 1

## add recent five years indicator ----
dat_raw$year_recent5 = ifelse(dat_raw$year>=2018 & dat_raw$year<=2022, 1, 0)

## add no. of coauthors ----
dat_raw$no_coauthors = dat_raw$no_coauthors + 1
dat_raw$no_coauthors[is.na(dat_raw$no_coauthors)] <- 1

## add university labels (abbvr.) ----
ls_uni <- readxl::read_xlsx("Dictionary_universities.xlsx") |>
  dplyr::select(region, university, university_abbr) |>
  dplyr::distinct()
dat_raw = dat_raw |>
  dplyr::left_join(ls_uni, by = c("region", "university")) |> 
  dplyr::relocate(university_abbr, .after = university) |>
  dplyr::arrange(region, university, department, name, year)

# Flag faculty status (3-class) -------------------------------------------

dat_raw$rank_full = as.integer(!is.na(dat_raw$rank) & dat_raw$rank=='full')
dat_raw$rank_assoc = as.integer(!is.na(dat_raw$rank) & dat_raw$rank=='associate')
dat_raw$rank_assist = as.integer(!is.na(dat_raw$rank) & dat_raw$rank=='assistant')

# Subset data by faculty status -------------------------------------------

dat_raw_full <- dat_raw[dat_raw$rank_full==1,]
dat_raw_assoc <- dat_raw[dat_raw$rank_assoc==1,]
dat_raw_assist <- dat_raw[dat_raw$rank_assist==1,]

dat_raw_rank3 <- list(
  'full' = dat_raw_full,
  'associate' = dat_raw_assoc,
  'assistant' = dat_raw_assist
)

# Compute department-level measures ---------------------------------------

## define summarizing helpers ----

make_levelFalc_data <- function(df_raw) {
  dat_faculty = df_raw |>
    dplyr::group_by(region, country, university, university_abbr, name) |>
    dplyr::summarise(
      # faculty status: full/part-time
      status = mean(status, na.rm = TRUE),
      # set i: unweighted measures
      ## citations: all and recent
      citations = sum(cites, na.rm = TRUE),
      citations_recent = sum(cites*year_recent5, na.rm = TRUE),
      ## impact: all and recent
      impact = sum(fiveyearjif_historic, na.rm = TRUE),
      impact_recent = sum(fiveyearjif_historic*year_recent5, na.rm = TRUE),
      ## top publications: all and recent
      pub_top = sum(polisci_top5, na.rm = TRUE),
      pub_top_recent = sum(polisci_top5*year_recent5, na.rm = TRUE),
      # set ii: measures weighted by historic impact and coauthor headcount
      ## citations: x historic JIF / no. of coauthors
      citations_historic_ca = sum(cites*jif_historic/no_coauthors, na.rm = TRUE),
      citations_recent_historic_ca = sum(cites*year_recent5*jif_historic/no_coauthors, na.rm = TRUE),
      ## impact: historic JIF / no. of coauthors
      impact_historic_ca = sum(fiveyearjif_historic/no_coauthors, na.rm = TRUE),
      impact_recent_historic_ca = sum(fiveyearjif_historic*year_recent5/no_coauthors, na.rm = TRUE),
      ## top publications: x historic JIF / no. of coauthors
      pub_top_historic_ca = sum(polisci_top5*jif_historic/no_coauthors, na.rm = TRUE),
      pub_top_recent_historic_ca = sum(polisci_top5*year_recent5*jif_historic/no_coauthors, na.rm = TRUE),
    ) |>
    dplyr::ungroup()
}

make_levelDept_data <- function(df_falc) {
  df_falc |>
    dplyr::group_by(region, country, university, university_abbr) |>
    dplyr::summarise(
      dept_size = sum(status,na.rm = TRUE),
      across(.cols= citations:pub_top_recent_historic_ca, .fns = ~sum(.x, na.rm = TRUE))
    ) |>
    dplyr::ungroup() |>
    dplyr::mutate(across( 
      .cols = citations:pub_top_recent_historic_ca, 
      .fns = function(x) x/dept_size,
      .names = "{.col}_pf"
    ))
}

pipeline_levelDept_data <- function(DF_raw) {
  make_levelFalc_data(df_raw = DF_raw) |>
    make_levelDept_data()
}

## Create department-level summaries ----

dat_dept_rank3 <- lapply(dat_raw_rank3, pipeline_levelDept_data)

# Rank departments by faculty status --------------------------------------

## define ranking helpers ----

score_to_league = function(DF, Var, Var_out, Top_N = 10, IDs = c("region","country", "university_abbr"),  Decimal = 0) {
  
  # inner helpers
  rank_by_score = function(df, var, ids = c("region","country", "university_abbr")) { 
    rank = rank(-df[[var]], ties.method = "random")
    ranks = data.frame(df[,ids], rank)
    colnames(ranks) = c(ids, var)
    return(ranks)
  }
  
  choose_top20_schools = function(df_rank, var, top_n = 10, ids = c("region","country", "university_abbr")) { 
    df_rank_topK = df_rank[df_rank[[var]]<=top_n, c(ids, var)]
    names(df_rank_topK) <- c(ids, "rank")
    df_rank_topK = df_rank_topK[order(df_rank_topK[["rank"]]),]
    return(df_rank_topK)
  }
  
  annotate_uni_by_score = function(df_rank, df_score, var, var_out, ids = c("region","country", "university_abbr"), decimal = 0) { 
    df_score = df_score |> dplyr::filter(university_abbr %in% df_rank[["university_abbr"]]) 
    if (decimal==0) df_score[[var]] = as.integer(df_score[[var]]) 
    else df_score[[var]] = round(df_score[[var]], decimal) 
    df_rankscore = dplyr::left_join(df_rank, df_score, by = ids)
    df_rankscore$measure_formatted = trimws(format(df_rankscore[[var]], nsmall=decimal, big.mark=",")) 
    df_rankscore$university_annotated = paste0(trimws(df_rankscore[["university_abbr"]]), " (", df_rankscore[["measure_formatted"]], ")")
    df_rankscore2 = df_rankscore[,c("rank", "university_annotated")]
    names(df_rankscore2) <- c("Rank", var_out)
    return(df_rankscore2)
  }
  
  # pipeline exe
  rank_by_score(df = DF, var = Var, ids = IDs) |> 
    choose_top20_schools(var = Var, top_n = Top_N, ids = IDs) |>
    annotate_uni_by_score(df_score = DF, var = Var, var_out = Var_out, ids = IDs, decimal = Decimal)
}

combine2stack_regional_ranks = function(ls_ranks, df_ID=IDs_region) { 
  Reduce(
    f = function(x,y) merge(x, y, all=TRUE, by=c("Region","Rank")), x = ls_ranks) |>
    (\(x) dplyr::left_join(df_ID, x, by=c("Region","Rank")))() |>
    dplyr::mutate(
      Citations = case_when(is.na(Citations)~"",TRUE~Citations),
      `Recent Citations` = case_when(is.na(`Recent Citations`)~"",TRUE~`Recent Citations`),
      Impact = case_when(is.na(Impact)~"",TRUE~Impact),
      `Recent Impact` = case_when(is.na(`Recent Impact`)~"",TRUE~`Recent Impact`),
      `Top Publications` = case_when(is.na(`Top Publications`)~"",TRUE~`Top Publications`),
      `Recent Top Publications` = case_when(is.na(`Recent Top Publications`)~"",TRUE~`Recent Top Publications`)
    ) |>
    dplyr::select(
      Region, Rank,
      Citations, Impact, `Top Publications`,
      `Recent Citations`, `Recent Impact`, `Recent Top Publications`
    ) |>
    dplyr::select(-Region)
}

make_regional_tables = function(tab_df, out_format="latex") { 
  kableExtra::kbl(
    tab_df, format = out_format, booktabs = TRUE, linesep = ""
  ) |>
    kableExtra::pack_rows(
      index = c("North America"=10,"Europe"=10,"Asia"=10,"Oceania"=10,"Latin America"=10,"Africa"=2), 
      bold=TRUE, italic=TRUE, hline_before = TRUE, hline_after = TRUE)
}

pipeline_make_ranking_tableI <- function(ls_dept_region, drop_toppubs_AFR = FALSE) {
  
  ## Columns 1-2: Citations 
  rank_citation_region = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "citations", Var_out = "Citations", Top_N = 10)) |>
    dplyr::bind_rows(.id = "Region")
  rank_citation_recent_region = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "citations_recent", Var_out = "Recent Citations", Top_N = 10)) |>
    dplyr::bind_rows(.id = "Region")
  
  ## Columns 3-4: Impact 
  rank_impact_region = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "impact", Var_out = "Impact", Top_N = 10)) |>
    dplyr::bind_rows(.id = "Region")
  rank_impact_recent_region = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "impact_recent", Var_out = "Recent Impact", Top_N = 10)) |>
    dplyr::bind_rows(.id = "Region")
  
  ## Columns 5-6: Top Publications 
  if (drop_toppubs_AFR) {
    rank_top5_region = ls_dept_region |>
      purrr::map(~dplyr::filter(.x, !pub_top==0))  %>% 
      {.[!grepl("Africa", names(.))]} |>
      purrr::map(~score_to_league(DF = .x, Var = "pub_top", Var_out = "Top Publications", Top_N = 10)) |>
      dplyr::bind_rows(.id = "Region")
  } else {
    rank_top5_region = ls_dept_region |>
      purrr::map(~dplyr::filter(.x, !pub_top==0)) |>
      purrr::map(~score_to_league(DF = .x, Var = "pub_top", Var_out = "Top Publications", Top_N = 10)) |>
      dplyr::bind_rows(.id = "Region")
  } 
  rank_top5_recent_region = ls_dept_region |>
    purrr::map(~dplyr::filter(.x, !pub_top_recent==0)) %>% 
    {.[!grepl("Africa", names(.))]} |>
    purrr::map(~score_to_league(DF = .x, Var = "pub_top_recent", Var_out = "Recent Top Publications", Top_N = 10)) %>% 
    dplyr::bind_rows(.id = "Region")
  
  ## Full Table: Citations + Impact + Top Publications 
  list(
    rank_citation_region, rank_impact_region, rank_top5_region,
    rank_citation_recent_region, rank_impact_recent_region, rank_top5_recent_region
  ) |>
    combine2stack_regional_ranks() |>
    make_regional_tables()
}

pipeline_make_ranking_tableII <- function(ls_dept_region, drop_toppubs_AFR = FALSE) {
  
  ## Columns 1-2: Citations per Faculty 
  rank_citation_region2 = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "citations_pf", Var_out = "Citations", Top_N = 10)) |>
    dplyr::bind_rows(.id = "Region")
  rank_citation_recent_region2 = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "citations_recent_pf", Var_out = "Recent Citations", Top_N = 10)) |>
    dplyr::bind_rows(.id = "Region")
  
  ## Columns 3-4: Impact per Faculty 
  rank_impact_region2 = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "impact_pf", Var_out = "Impact", Top_N = 10, Decimal = 2)) |>
    dplyr::bind_rows(.id = "Region")
  rank_impact_recent_region2 = ls_dept_region |>
    purrr::map(~score_to_league(DF = .x, Var = "impact_recent_pf", Var_out = "Recent Impact", Top_N = 10, Decimal = 2)) |>
    dplyr::bind_rows(.id = "Region")
  
  ## Columns 5-6: Top Publications per Faculty 
  if (drop_toppubs_AFR) {
    rank_top5_region2 = ls_dept_region |>
      purrr::map(~dplyr::filter(.x, !pub_top==0)) %>% 
      {.[!grepl("Africa", names(.))]} |>
      purrr::map(~score_to_league(DF = .x, Var = "pub_top_pf", Var_out = "Top Publications", Top_N = 10, Decimal = 2)) |>
      dplyr::bind_rows(.id = "Region")
  } else {
    rank_top5_region2 = ls_dept_region |>
      purrr::map(~dplyr::filter(.x, !pub_top==0)) |>
      purrr::map(~score_to_league(DF = .x, Var = "pub_top_pf", Var_out = "Top Publications", Top_N = 10, Decimal = 2)) |>
      dplyr::bind_rows(.id = "Region")
  }
  rank_top5_recent_region2 = ls_dept_region |>
    purrr::map(~dplyr::filter(.x, !pub_top_recent==0)) %>% 
    {.[!grepl("Africa", names(.))]} |>
    purrr::map(~score_to_league(DF = .x, Var = "pub_top_recent_pf", Var_out = "Recent Top Publications", Top_N = 10, Decimal = 2)) %>% 
    dplyr::bind_rows(.id = "Region")
  
  ## Full Table: Citations + Impact + Top Publications per Faculty 
  list(
    rank_citation_region2, rank_impact_region2, rank_top5_region2,
    rank_citation_recent_region2, rank_impact_recent_region2, rank_top5_recent_region2
  ) |>
    combine2stack_regional_ranks() |>
    make_regional_tables()
  
}

## set region list ----

names_regions = c("North America", "Europe", "Asia", "Oceania", "Latin America", "Africa")
IDs_region <- data.frame( 
  "Region" =   rep(names_regions, times = c(10,10,10,10,10,2)), 
  "Rank" = c(rep(1:10,5),1:2)
)

## split subsets by region ----

ls_dept_rank3 <- dat_dept_rank3 |>
  purrr::map(function(df) split(df, f = df$region)) |>
  purrr::map(function(ls) ls[names_regions])

## Tables A6-A8: Top-10 by Region by Faculty Status ----

RankTabA6toA8 <- purrr::map2(ls_dept_rank3, c(FALSE, TRUE, TRUE), ~pipeline_make_ranking_tableI(.x, .y)) 
if (export_tables) purrr::map2(RankTabA6toA8, paste0("Tables/TableA", 6:8, '.tex'), ~writeLines(.x, .y))

## Tables A9-A11: Top-10 Per Capita by Region by Faculty Status ----

RankTabA9toA11 <- purrr::map2(ls_dept_rank3, c(FALSE, TRUE, TRUE), ~pipeline_make_ranking_tableII(.x, .y)) 
if (export_tables) purrr::map2(RankTabA9toA11, paste0("Tables/TableA", 9:11, '.tex'), ~writeLines(.x, .y))

# END #
